# AMAR ET AL. - COUNTERING MISINFORMATION EARLY (2025)
## REPLICATION FILE: 14_robust_attrition.R
### To alleviate concerns about non-random attrition, in this script, we conduct three analyses:
#### (1) Inverse-probability weighting (IPW) 
#### (2) Tipping point analysis
#### (3) Lee bounds
# ----
# Inverse-probability weighting ----
## Main endline ----
# Compute inverse probability weights
attrition_mod <- svyglm(
  endline_complete ~ treatment + treatment * (
    gender
    + class
    + age
    + religion_hindu_num
    + language_hindu_num
    + asset_index
    + fathers_education
    + mothers_education
    + school_gov_num
    + science
    + mobile_internet_num
    + trust_newspapers_num
    + trust_social_media_num
    + trust_tv_num
    + trust_friends_family_num
    + vaccinated_num
    + ayurveda_effective_num
  ),
  design = bimli_svy_dkr.rm)

# Set impute vars 
impute_vars <- attrition_predictors$iv

# Set up helper function for non-numeric vars
get_mode <- function(x) {
  ux <- unique(na.omit(x))
  ux[which.max(tabulate(match(x, ux)))]
}

# Compute predicted probabilities based on model
bimli_svy_dkr.rm$variables$ps_obs <- 
  predict.glm(attrition_mod, 
              newdata = bimli_svy_dkr.rm %>%
                mutate(across(all_of(impute_vars), ~ {
                  if (is.numeric(.x)) {
                    # median imputation
                    replace(.x, is.na(.x), median(.x, na.rm = TRUE))
                  } else {
                    # most‐common‐value imputation
                    replace(.x, is.na(.x), get_mode(.x))
                  }
                })),
              type = "response")

# Compute inverse probabilities
bimli_svy_dkr.rm <- bimli_svy_dkr.rm %>%
  mutate(
    ipw_att = 1/ps_obs)

# function to tabulate regressions for each dv
tab_regression_ipw <- function(varname, design) {
  fml <- as.formula(str_c(varname, "~treatment", "+ library_spillover_pre"))
  mod <- svyglm(fml, weights = ipw_att, design = design)
  out <- tidy(mod) %>%
    mutate(dv = varname, n = nobs(mod), .before = 1)
  out
}

# Main survey outcomes
regressions_ipw <- lapply(dv_labs$dv, \(x) tab_regression_ipw(x, bimli_svy_dkr.rm)) %>% # make a list of regression tables
  bind_rows %>%
  mutate(margin = qnorm(0.975) * std.error,
         lower = estimate - margin,
         upper = estimate + margin) %>%
  filter(term != "(Intercept)" & !grepl("library_spillover_pre", term)) %>% # remove intercepts and district FEs
  select(-term)

regressions_ipw <- left_join(dv_labs, regressions_ipw, by = "dv")

regressions_ipw %>%
  filter(is.na(subidx) & secondary_outcome == FALSE & mechanism == FALSE) %>% # omit questions that are used in a subindex
  mutate(label = if_else(str_detect(label, "Index"), "Index", label),
         label = as_factor(label),
         idx = as_factor(idx),
         is.idx = if_else(label == "Index", TRUE, FALSE),
         significance = ifelse(p.value < 0.05, "YES", "NO")) %>%
  ggplot(aes(x = estimate, y = fct_rev(label), color = is.idx, linetype = significance)) +
  geom_vline(xintercept = 0, lty = 2) +
  geom_point() +
  geom_errorbar(aes(xmin = lower, xmax = upper), width = 0) +
  facet_grid(rows = "idx", scales = "free_y", space = "free_y", switch = "y") + # facet by index
  xlab("Estimated effect of treatment in control-group SDs (Positive expected)") +
  scale_y_discrete(position = "right") +
  scale_color_manual(values = c("black", "red")) +
  scale_linetype_manual(values=c("dotted", "solid")) +
  theme_bw() %+replace%
  theme(axis.title.y = element_blank(),
        strip.text.y.left = element_text(angle = 0),
        legend.position = "none")
ggsave("output/figures/effects_ipw.pdf", height = 5, width = 8)

regressions_ipw %>%
  filter(is.na(subidx) & secondary_outcome == FALSE & mechanism == FALSE) %>% # omit questions that are used in a subindex
  select(label, n, estimate, std.error, p.value) %>%
  mutate(label = str_remove_all(label, " Index"),
         n = format(n, big.mark = ","),
         estimate = format(round(estimate, 2), nsmall = 2),
         std.error = round(std.error, 3),
         p.value = case_when(p.value > 0.9 ~ "$>$0.9",
                             p.value < 0.001 ~ "$<$0.001",
                             p.value < 0.01 ~ as.character(round(p.value, 3)),
                             TRUE ~ as.character(round(p.value, 2)))) %>%
  rename(Outcome = label,
         N = n,
         Estimate = estimate,
         SE = std.error,
         "p-value" = p.value) %>%
  xtable::xtable(caption = "ITT results with IPW", align = "llllll", digits = 3,
                 label = "tab:itt_inverse_prob_weighting") %>%
  print(include.rownames = FALSE, 
        caption.placement = "top",
        sanitize.text.function = identity, # add backslashes to special characters
        sanitize.colnames.function = \(x) str_c("\\textbf{", x, "}"),
        #add.to.row = list(pos = list(nrow(.)),
        #                 command = "\\hline\nP-values adjusted for \nFalse Discovery Rate (FDR) \nand using Bonferroni correction."), # add key for significance stars
        add.to.row = list(
          pos = list(nrow(.), nrow(.)), # Two lines added at the end of the table
          command = c(
            "\\hline\n\\multicolumn{5}{l}{\\footnotesize ITT effects using inverse probability weights to account for attrition.} \\\\\n",
            "\\multicolumn{5}{l}{\\footnotesize DV: Main Indices. Models include library-spillover FEs.} \\\\\n"
          )),
        hline.after = c(-1,0), # add horizontal lane after second to last column
        comment = FALSE,
        table.placement = getOption("xtable.table.placement", "h!"),
        file = paste0("output/tables/tab_itt_inverse_prob_weighting", ".tex"))

## Follow-up ----
### IPW Follow-Up
# Compute inverse probability weights
attrition_follow_mod <- svyglm(
  follow_reached ~ treatment + treatment * (
    gender
    + class
    + age
    + religion_hindu_num
    + language_hindu_num
    + asset_index
    + fathers_education
    + mothers_education
    + school_gov_num
    + science
    + mobile_internet_num
    + trust_newspapers_num
    + trust_social_media_num
    + trust_tv_num
    + trust_friends_family_num
    + vaccinated_num
    + ayurveda_effective_num
    + awareness
    + accuracy_discernment
    + sharing_discernment
    + health_preferences
    + source_discernment
    + engagement_attitude
    + engagement_behavior
  ),
  design = bimli_svy_dkr.rm)

# Set impute vars 
impute_vars_follow <- attrition_follow_predictors$iv

# Set up helper function for non-numeric vars
get_mode <- function(x) {
  ux <- unique(na.omit(x))
  ux[which.max(tabulate(match(x, ux)))]
}

# Compute predicted probabilities based on model
bimli_svy_dkr.rm$variables$ps_obs_follow <- 
  predict.glm(attrition_follow_mod, 
              newdata = bimli_svy_dkr.rm %>%
                mutate(across(all_of(impute_vars), ~ {
                  if (is.numeric(.x)) {
                    # median imputation
                    replace(.x, is.na(.x), median(.x, na.rm = TRUE))
                  } else {
                    # most‐common‐value imputation
                    replace(.x, is.na(.x), get_mode(.x))
                  }
                })),
              type = "response")

# Compute inverse probabilities
bimli_svy_dkr.rm <- bimli_svy_dkr.rm %>%
  mutate(
    ipw_att_follow = 1/ps_obs_follow)

# function to tabulate regressions for each dv
tab_regression_ipw_follow <- function(varname, design) {
  fml <- as.formula(str_c(varname, "~treatment", "+ library_spillover_pre"))
  mod <- svyglm(fml, weights = ipw_att_follow, design = design)
  out <- tidy(mod) %>%
    mutate(dv = varname, n = nobs(mod), .before = 1)
  out
}

# Follow-up survey outcomes
regressions_follow_up <- lapply(follow_up_dvs$dv, \(x) tab_regression_ipw_follow(x, bimli_svy_dkr.rm)) %>% # make a list of regression tables
  bind_rows %>%
  mutate(margin = qnorm(0.975) * std.error,
         lower = estimate - margin,
         upper = estimate + margin) %>%
  filter(term != "(Intercept)" & !grepl("library_spillover_pre", term)) %>% # remove intercepts and district FEs
  select(-term)
regressions_follow_up <- left_join(follow_up_dvs, regressions_follow_up, by = "dv")

# Main survey outcomes
regressions_ipw_follow <- lapply(follow_up_dvs$dv, \(x) tab_regression_ipw_follow(x, bimli_svy_dkr.rm)) %>% # make a list of regression tables
  bind_rows %>%
  mutate(margin = qnorm(0.975) * std.error,
         lower = estimate - margin,
         upper = estimate + margin) %>%
  filter(term != "(Intercept)" & !grepl("library_spillover_pre", term)) %>% # remove intercepts and district FEs
  select(-term)
regressions_ipw_follow <- left_join(follow_up_dvs, regressions_ipw_follow, by = "dv")

regressions_ipw_follow %>%
  filter(is.na(subidx) & secondary_outcome == FALSE & mechanism == FALSE) %>% # omit questions that are used in a subindex
  mutate(label = if_else(str_detect(label, "Index"), "Index", label),
         label = as_factor(label),
         idx = as_factor(idx),
         is.idx = if_else(label == "Index", TRUE, FALSE),
         significance = ifelse(p.value < 0.05, "YES", "NO")) %>%
  ggplot(aes(x = estimate, y = fct_rev(label), color = is.idx, linetype = significance)) +
  geom_vline(xintercept = 0, lty = 2) +
  geom_point() +
  geom_errorbar(aes(xmin = lower, xmax = upper), width = 0) +
  facet_grid(rows = "idx", scales = "free_y", space = "free_y", switch = "y") + # facet by index
  xlab("Estimated effect of treatment in control-group SDs (Positive expected)") +
  scale_y_discrete(position = "right") +
  scale_color_manual(values = c("black", "red")) +
  scale_linetype_manual(values=c("dotted", "solid")) +
  theme_bw() %+replace%
  theme(axis.title.y = element_blank(),
        strip.text.y.left = element_text(angle = 0),
        legend.position = "none")
ggsave("output/figures/effects_follow_ipw.pdf", height = 5, width = 8)

regressions_ipw_follow %>%
  filter(is.na(subidx) & secondary_outcome == FALSE & mechanism == FALSE) %>% # omit questions that are used in a subindex
  mutate(
    label_rec = case_when(
      dv == "follow_accuracy_discernment" ~ "Accuracy Discernment Index",
      dv == "follow_accuracy_discernment_political" ~ "Political Discernment Index",
      dv == "follow_source_discernment" ~ "Source Discernment Index",
      TRUE ~ label)) %>%
  select(label_rec, n, estimate, std.error, p.value) %>%
  mutate(n = format(n, big.mark = ","),
         estimate = format(round(estimate, 2), nsmall = 2),
         std.error = round(std.error, 3),
         p.value = case_when(p.value > 0.9 ~ "$>$0.9",
                             p.value < 0.001 ~ "$<$0.001",
                             p.value < 0.01 ~ as.character(round(p.value, 3)),
                             TRUE ~ as.character(round(p.value, 2)))) %>%
  rename(Outcome = label_rec,
         N = n,
         Estimate = estimate,
         SE = std.error,
         "p-value" = p.value) %>%
  xtable::xtable(caption = "Second endline ITT results with IPW", align = "llllll", digits = 3,
                 label = "tab:itt_inverse_prob_weighting_follow") %>%
  print(include.rownames = FALSE, 
        caption.placement = "top",
        sanitize.text.function = identity, # add backslashes to special characters
        sanitize.colnames.function = \(x) str_c("\\textbf{", x, "}"),
        #add.to.row = list(pos = list(nrow(.)),
        #                 command = "\\hline\nP-values adjusted for \nFalse Discovery Rate (FDR) \nand using Bonferroni correction."), # add key for significance stars
        add.to.row = list(
          pos = list(nrow(.), nrow(.)), # Two lines added at the end of the table
          command = c(
            "\\hline\n\\multicolumn{5}{l}{\\footnotesize ITT effects using inverse probability weights to account for attrition.} \\\\\n",
            "\\multicolumn{5}{l}{\\footnotesize DV: Main Indices. Models include library-spillover FEs.} \\\\\n"
          )),
        hline.after = c(-1,0), # add horizontal lane after second to last column
        comment = FALSE,
        table.placement = getOption("xtable.table.placement", "h!"),
        file = paste0("output/tables/tab_itt_follow_up_inverse_prob_weighting", ".tex"))

# ----
# Tipping point analysis ----
tipping_point_analysis_df <- data.frame(
  outcome = character(),
  tipping_point_significant = numeric(),
  tipping_point_zero = numeric(),
  tipping_point_negative_significant = numeric(),
  stringsAsFactors = FALSE
)

covariates <- c(
  "gender", 
  "class", 
  "age", 
  "religion_hindu_num", 
  "language_hindu_num", 
  "asset_index", 
  "fathers_education", 
  "mothers_education", 
  "school_gov_num", 
  "science", 
  "mobile_internet_num", 
  "trust_newspapers_num", 
  "trust_social_media_num", 
  "trust_tv_num", 
  "trust_friends_family_num", 
  "vaccinated_num", 
  "ayurveda_effective_num", 
  "compliance_minimal")


tipping_point_analysis <- function(outcomes, design, alpha = 0.05, increment = 0.01) {
  # Loop over each outcome
  for (outcome in outcomes) {
    # Extract control group mean and standard deviation
    control_sd <- sd(design$variables[[outcome]][design$variables$treatment == "Spoken English"], na.rm = TRUE)
    
    #treatment_mean <- mean(design$variables[[outcome]][design$variables$treatment == "Media Literacy"], na.rm = TRUE)
    
    # Create a copy of the design object for manipulation
    modified_design <- design
    
    # Impute 
    data_for_imputation <- design$variables[, c(outcome, "treatment", covariates)]
    imputed_data <- mice(data_for_imputation, m = 1, method = "pmm", seed = 123, print = FALSE)
    
    # Assign imputed outcome data to modified design
    modified_design$variables[[outcome]] <- complete(imputed_data)[[outcome]]
    
    # Initialize variables
    tipping_significant <- FALSE
    tipping_zero <- FALSE
    tipping_negative_significant <- FALSE
    sd_offset <- 0 - increment
    
    # Placeholder for tipping points
    tipping_point_significant <- NA
    tipping_point_zero <- NA
    tipping_point_negative_significant <- NA
    
    while (!tipping_significant || !tipping_zero || !tipping_negative_significant) {
      # Increment the SD offset
      sd_offset <- sd_offset + increment
      
      cat(paste("Doing analysis for outcome:", outcome,"\n", "SD offset:", sd_offset, "\n"))
      
      modified_design$variables[[outcome]][is.na(design$variables[[outcome]] & design$variables$treatment == "Media Literacy")] <- complete(imputed_data)[[outcome]][is.na(design$variables[[outcome]] & design$variables$treatment == "Media Literacy")] - sd_offset * control_sd
      
      # Define the formula for regression
      fml <- as.formula(paste(outcome, "~ treatment + library_spillover_pre"))
      
      # Run the regression
      mod <- svyglm(fml, design = modified_design)
      
      # Extract the treatment effect estimate and its p-value
      treatment_effect <- summary(mod)$coefficients["treatmentMedia Literacy", "Estimate"]
      p_value <- summary(mod)$coefficients["treatmentMedia Literacy", "Pr(>|t|)"]
      
      # Check for tipping points
      if (!tipping_significant && p_value > alpha) {
        tipping_point_significant <- sd_offset
        tipping_significant <- TRUE
      }
      
      if (!tipping_zero && treatment_effect <= 0) {
        tipping_point_zero <- sd_offset
        tipping_zero <- TRUE
      }
      
      if (!tipping_negative_significant && treatment_effect < 0 && p_value < alpha) {
        tipping_point_negative_significant <- sd_offset
        tipping_negative_significant <- TRUE
      }
    }
    
    # Add the tipping points to the results dataframe
    tipping_point_analysis_df <<- rbind(
      tipping_point_analysis_df,
      data.frame(
        outcome = outcome,
        tipping_point_significant = round(tipping_point_significant, 2),
        tipping_point_zero = round(tipping_point_zero, 2),
        tipping_point_negative_significant = round(tipping_point_negative_significant, 2)
      )
    )
  }
}

# Run function for all Index outcomes
dvs_tipping_point <- dv_labs %>%
  filter(type == "idx" & secondary_outcome == FALSE & mechanism == FALSE) %>%
  select(dv, label)

# To verify, uncomment lines below
# tipping_point_analysis(outcomes = dvs_tipping_point$dv,
#                        bimli_svy_dkr.rm,
#                        alpha = 0.05,
#                        increment = 0.01)
# write.csv(tipping_point_analysis_df, "data/cleaned/tipping_point_analysis_df.csv")

# Read in results because function takes a long time to run
## To verify results, uncomment the lines immediately above and re-run

tipping_point_analysis_df <-
  read.csv("data/cleaned/tipping_point_analysis_df.csv",
           stringsAsFactors = FALSE)

# Merge 
desired_order <- c(
  "accuracy_discernment",
  "sharing_discernment",
  "source_discernment",
  "health_preferences",
  "engagement_attitude",
  "engagement_behavior",
  "awareness")

tipping_point_tab <- dvs_tipping_point %>%
  left_join(tipping_point_analysis_df, by = c("dv" = "outcome")) %>%
  mutate(dv = factor(dv, levels = desired_order)) %>%
  arrange(dv) %>%
  mutate(tipping_point_significant = ifelse(tipping_point_significant == 0, NA_real_, tipping_point_significant),
         tipping_point_zero = ifelse(tipping_point_zero == 0, NA_real_, tipping_point_zero),
         tipping_point_negative_significant = ifelse(tipping_point_negative_significant == 0, NA_real_, tipping_point_negative_significant))

# Create latex table
tipping_point_tab_latex <- tipping_point_tab %>%
  mutate(across(everything(), ~ ifelse(is.na(.), "--", as.character(.)))) %>% # Replace NAs with "--"
  select(-dv, -X) %>%
  mutate(label = str_replace_all(label, " Index", "")) %>%
  dplyr::rename(
    "Outcome" = label,
    "Positive but Not Sign." = tipping_point_significant,
    "Estimate below 0" = tipping_point_zero,
    "Negative and Sign." = tipping_point_negative_significant
  ) %>%
  kableExtra::kbl(
    format = "latex",
    booktabs = TRUE,
    linesep = c(""),
    caption = "Tipping point analysis results",
    label = "tipping_points",
    position = "h!",
    align = c("l", "r", "r", "r") # Specify column alignment here
  ) %>%
  kableExtra::add_header_above(c(" " = 1, "Tipping points (in SD units)" = 3)) %>%
  kableExtra::footnote(
    general = "This table summarizes the tipping points at which the treatment effect loses statistical significance, reaches zero, and becomes significantly negative. The values represent how many (control-group) SD units below the imputed values the treatment group outcomes would need to be in each case. Models include library-spillover FEs.",
    threeparttable = TRUE, fixed_small_size = T, escape = FALSE, footnote_as_chunk = TRUE, general_title = "Note:"
  )
writeLines(tipping_point_tab_latex, here::here("output/tables/tab_tipping_point_table.tex"))

# ----
# Lee bounds ----
## Function ----
compute_lee_bounds_ranked <- function(svy_data,
                                      outcome_var,
                                      treat_var = "treatment",
                                      obs_var = "endline_complete",
                                      fe_var = NULL,
                                      cluster_var = NULL,
                                      seed = 123) {
  set.seed(seed)
  
  # Convert survey object to data.frame
  data <- as.data.frame(svy_data)
  
  # Filter to units with treatment and endline info
  valid_data <- data[!is.na(data[[treat_var]]) & !is.na(data[[obs_var]]), ]
  
  if (nrow(valid_data) == 0) {
    return(data.frame(outcome = outcome_var,
                      lower_bound = NA,
                      upper_bound = NA,
                      observed_diff = NA))
  }
  
  # Compute response rates
  p_treat <- mean(valid_data[[treat_var]] == 1 & valid_data[[obs_var]] == 1)
  p_ctrl  <- mean(valid_data[[treat_var]] == 0 & valid_data[[obs_var]] == 1)
  trim_share <- abs(p_treat - p_ctrl)
  
  # Use only non-missing outcomes with complete endline
  df_obs <- data[!is.na(data[[outcome_var]]) & data[[obs_var]] == 1, ]
  treat_grp <- df_obs[df_obs[[treat_var]] == 1, ]
  ctrl_grp  <- df_obs[df_obs[[treat_var]] == 0, ]
  
  if (nrow(treat_grp) == 0 || nrow(ctrl_grp) == 0) {
    return(data.frame(outcome = outcome_var,
                      lower_bound = NA,
                      upper_bound = NA,
                      observed_diff = NA))
  }
  
  # Compute number to trim
  n_trim <- floor(nrow(treat_grp) * trim_share)
  if (n_trim >= nrow(treat_grp)) {
    return(data.frame(outcome = outcome_var,
                      lower_bound = NA,
                      upper_bound = NA,
                      observed_diff = NA))
  }
  
  # Sort treated units by outcome
  treat_sorted <- treat_grp[order(treat_grp[[outcome_var]]), ]
  
  # Lower bound: drop highest values (assume trimmed are best-case)
  treat_lb <- head(treat_sorted, n = nrow(treat_sorted) - n_trim)
  
  # Upper bound: drop lowest values (assume trimmed are worst-case)
  treat_ub <- tail(treat_sorted, n = nrow(treat_sorted) - n_trim)
  
  # Regression helper
  run_reg <- function(df) {
    if (!is.null(fe_var)) {
      df[[fe_var]] <- as.factor(df[[fe_var]])
    }
    
    formula_str <- paste0(outcome_var, " ~ ", treat_var)
    if (!is.null(fe_var)) {
      formula_str <- paste0(formula_str, " + factor(", fe_var, ")")
    }
    formula <- as.formula(formula_str)
    
    model <- lm(formula, data = df)
    
    if (!is.null(cluster_var)) {
      vcov_mat <- vcovCR(model, cluster = df[[cluster_var]], type = "CR2")
    } else {
      vcov_mat <- vcovHC(model, type = "HC1")
    }
    
    coef <- coeftest(model, vcov = vcov_mat)
    if (!(treat_var %in% rownames(coef))) return(NA)
    return(coef[treat_var, 1])
  }
  
  # Run regressions
  est_obs <- run_reg(rbind(treat_grp, ctrl_grp))
  est_lb  <- run_reg(rbind(treat_lb, ctrl_grp))
  est_ub  <- run_reg(rbind(treat_ub, ctrl_grp))
  
  return(data.frame(
    outcome = outcome_var,
    lower_bound = round(est_lb, 3),
    upper_bound = round(est_ub, 3),
    observed_diff = round(est_obs, 3)
  ))
}

## Main endline ----
lee_results <- lapply(
  dv_labs_endline_main$dv[is.na(dv_labs_endline_main$subidx)],
  function(outcome) {
    tryCatch({
      compute_lee_bounds_ranked(
        svy_data = bimli_svy_dkr.rm,
        outcome_var = outcome,
        treat_var = "treatment_num",
        obs_var = "endline_complete",
        fe_var = "library_spillover_pre",
        cluster_var = "village"
      )
    }, error = function(e) {
      warning(paste("Error computing Lee bounds for", outcome, ":", e$message))
      data.frame(outcome = outcome, lower_bound = NA, upper_bound = NA, observed_diff = NA)
    })
  }
) %>% bind_rows()

# Now merge lee_results with dv_labs_endline_main to add the variable labels
lee_results <- merge(
  lee_results,
  dv_labs_endline_main[is.na(dv_labs_endline_main$subidx), c("dv", "label")],
  by.x = "outcome",
  by.y = "dv"
)

# Directly define the order in the arrange statement using match
lee_results <- lee_results %>%
  arrange(match(label, c("Accuracy Discernment", "Sharing Discernment", 
                         "Source Discernment", "Health Preferences", 
                         "Engagement Attitude", "Engagement Behavior", 
                         "Awareness")))

# Create latex table
lee_results_latex <- lee_results %>%
  mutate(across(everything(), ~ ifelse(is.na(.), "--", as.character(.)))) %>% # Replace NAs with "--"
  select(-outcome) %>%
  mutate(label = str_replace_all(label, " Index", "")) %>%
  # Reorder columns by selecting them in the desired order
  select(label, observed_diff, lower_bound, upper_bound) %>%
  dplyr::rename(
    "Outcome" = label,
    "Diff." = observed_diff,
    "Lower bound" = lower_bound,
    "Upper bound" = upper_bound
  ) %>%
  kableExtra::kbl(
    format = "latex",
    booktabs = TRUE,
    linesep = c(""),
    caption = "Lee bounds analysis results",
    label = "lee_bounds",
    position = "h!",
    align = c("l", "r", "r", "r") # Specify column alignment here
  ) %>%
  kableExtra::footnote(
    general = "This table summarizes the difference in means for observed outcomes and Lee bounds estimated using a rank-based trimming procedure. The lower bound assumes that attrited treated units would have had the best outcomes and trims the top of the treated outcome distribution. The upper bound assumes they would have had the worst outcomes and trims the bottom. The trimming proportion equals the difference in response rates between the treatment and control groups.",
    threeparttable = TRUE, fixed_small_size = T, escape = FALSE, footnote_as_chunk = TRUE, general_title = "Note:"
  )

writeLines(lee_results_latex, here::here("output/tables/tab_lee_results_table.tex"))

## Follow-up ----
lee_results_follow_up <- lapply(
  follow_up_dvs$dv,
  function(outcome) {
    tryCatch({
      compute_lee_bounds_ranked(
        svy_data = bimli_svy_dkr.rm,
        outcome_var = outcome,
        treat_var = "treatment_num",
        obs_var = "endline_complete",
        fe_var = "library_spillover_pre",
        cluster_var = "village"
      )
    }, error = function(e) {
      warning(paste("Error computing Lee bounds for", outcome, ":", e$message))
      data.frame(outcome = outcome, lower_bound = NA, upper_bound = NA, observed_diff = NA)
    })
  }
) %>% bind_rows()

# Update follow_up_dvs$label based on dv values
follow_up_dvs <- follow_up_dvs %>%
  mutate(label = case_when(
    dv == "follow_accuracy_discernment" ~ "Accuracy Discernment",
    dv == "follow_accuracy_discernment_political" ~ "Political Discernment",
    dv == "follow_source_discernment" ~ "Source Discernment",
    TRUE ~ label  # Keep original label for all other cases
  ))

# Now merge lee_results with dv_labs_endline_main to add the variable labels
lee_results_follow_up <- merge(
  lee_results_follow_up,
  follow_up_dvs[is.na(follow_up_dvs$subidx), c("dv", "label")],
  by.x = "outcome",
  by.y = "dv"
)


# Directly define the order in the arrange statement using match
lee_results_follow_up <- lee_results_follow_up %>%
  arrange(match(outcome, c("follow_accuracy_discernment", "follow_accuracy_true", 
                           "follow_accuracy_false", "follow_accuracy_discernment_political", 
                           "follow_discernment_political_true", "follow_discernment_political_false", 
                           "follow_source_discernment", "follow_source_discern_specific_good", "follow_source_discernment")))

# Create latex table
lee_results_latex_follow <- lee_results_follow_up %>%
  mutate(across(everything(), ~ ifelse(is.na(.), "--", as.character(.)))) %>% # Replace NAs with "--"
  select(-outcome) %>%
  # Reorder columns by selecting them in the desired order
  select(label, observed_diff, lower_bound, upper_bound) %>%
  dplyr::rename(
    "Outcome" = label,
    "Diff." = observed_diff,
    "Lower bound" = lower_bound,
    "Upper bound" = upper_bound
  ) %>%
  kableExtra::kbl(
    format = "latex",
    booktabs = TRUE,
    linesep = c(""),
    caption = "Lee bounds analysis results (second endline)",
    label = "lee_bounds_follow",
    position = "h!",
    align = c("l", "r", "r", "r") # Specify column alignment here
  ) %>%
  kableExtra::footnote(
    general = "This table summarizes the difference in means for observed outcomes and Lee bounds estimated using a rank-based trimming procedure. The lower bound assumes that attrited treated units would have had the best outcomes and trims the top of the treated outcome distribution. The upper bound assumes they would have had the worst outcomes and trims the bottom. The trimming proportion equals the difference in response rates between the treatment and control groups.",
    threeparttable = TRUE, fixed_small_size = T, escape = FALSE, footnote_as_chunk = TRUE, general_title = "Note:"
  )


writeLines(lee_results_latex_follow, here::here("output/tables/tab_lee_results_follow_table.tex"))







# END of 14_robust_attrition.R ----